Tfh.Map@meta.data$celltype <- factor(Tfh.Map@meta.data$celltype, levels = c("Tfh1", "Tfh2", "Tfh17", "PD1n"))
Tfh.Map@meta.data$donor_id <- factor(Tfh.Map@meta.data$donor_id, levels = c("donor0", "donor1", "donor2", "donor3"))
viol_cols_qc <- c("donor0"="#FFFFFF", "donor1"="#8F9A98", "donor2"="#DADEDD", "donor3"="#4A5251")
##Creating each plot
a <- VlnPlot(Tfh.Map, features = c("nCount_RNA"),
cols = viol_cols_qc, group.by = "celltype", split.by = "donor_id",
pt.size = -1, flip = FALSE)
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##
## This message will be shown once per session.
# Extract max y value from the plot
max_a <- ceiling(max(ggplot_build(a)$data[[1]]$y, na.rm = TRUE))
# Modify the plot to only show the max y tick
af <-a + scale_y_continuous(breaks = max_a, labels = max_a) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
b <- VlnPlot(Tfh.Map, features = c("nFeature_RNA"),
cols = viol_cols_qc, group.by = "celltype", split.by = "donor_id",
pt.size = -1, flip = FALSE)
# Extract max y value from the plot
max_b <- ceiling(max(ggplot_build(b)$data[[1]]$y, na.rm = TRUE))
# Modify the plot to only show the max y tick
bf <- b + scale_y_continuous(breaks = max_b, labels = max_b) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
c <- VlnPlot(Tfh.Map, features = c("percent.mt"),
cols = viol_cols_qc, group.by = "celltype", split.by = "donor_id",
pt.size = -1, flip = FALSE)
# Extract max y value from the plot
y_vals <- ggplot_build(c)$data[[1]]$y
max_y <- ceiling(max(y_vals, na.rm = TRUE))
# Modify the plot to only show the max y tick
cf <- c + scale_y_continuous(breaks = 10.21729, labels = max_y) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
final <- af/bf/cf
print(final)
Input can be found in zenodo https://zenodo.org/records/14847353 Raw data have been deposited in the NCBI data base under accession number GSE272939 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE272939 and accession number GSE253661 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE253661
# We use the for loop to execute two commands for each sample - (1) read in the count data (Read10X()) and (2) create the Seurat objects from the read in data (CreateSeuratObject()):
##Formating input
# Set working directory (customize this as needed)
# Get list of .gz files
file.names <- list.files(pattern = '\\.gz$', recursive = TRUE)
# Function to extract the part between first pair of underscores
extract_key <- function(filename) {
matches <- regmatches(filename, regexpr("_(.*?)_", filename, perl = TRUE))
key <- gsub("_", "", matches) # remove underscores
return(key)
}
# Extract keys
keys <- sapply(file.names, extract_key)
# Unique folder names based on extracted keys
unique.keys <- unique(keys)
# Create folders if they don't exist
sapply(unique.keys, function(key) {
if (!dir.exists(key)) dir.create(key)
})
# Copy files into their corresponding folders
for (i in seq_along(file.names)) {
original_path <- file.path(getwd(), file.names[i])
key <- keys[i]
# Extract new file name by removing prefix (up to second underscore)
new_name <- sub("^[^_]+_[^_]+_", "", basename(file.names[i]))
dest_path <- file.path(getwd(), key, new_name)
file.copy(original_path, dest_path, overwrite = TRUE)
}
# Create Seurat objects for each folder
seurat_objects <- list()
for (key in unique.keys) {
folder_path <- file.path(getwd(), key)
seurat_data <- Read10X(data.dir = folder_path)
seurat_obj <- CreateSeuratObject(counts = seurat_data,
min.cells = 3,
min.features = 200,
project = key)
seurat_objects[[key]] <- seurat_obj
assign(key, seurat_obj)
}
##Changing suffix to make the cells uniques
seurat_objects$PD1Neg@meta.data$cellname <- colnames(seurat_objects$PD1Neg)
seurat_objects$Tfh1@meta.data$cellname <- colnames(seurat_objects$Tfh1)
seurat_objects$Tfh2@meta.data$cellname <- colnames(seurat_objects$Tfh2)
seurat_objects$Tfh17@meta.data$cellname <- colnames(seurat_objects$Tfh17)
seurat_objects$PD1Neg@meta.data$cellname <- gsub("*-1", "-1", seurat_objects$PD1Neg@meta.data$cellname)
seurat_objects$Tfh1@meta.data$cellname <- gsub("*-1", "-2", seurat_objects$Tfh1@meta.data$cellname)
seurat_objects$Tfh2@meta.data$cellname <- gsub("*-1", "-3", seurat_objects$Tfh2@meta.data$cellname)
seurat_objects$Tfh17@meta.data$cellname <- gsub("*-1", "-4", seurat_objects$Tfh17@meta.data$cellname)
##adding information as character
vector1 <- seurat_objects$PD1Neg@meta.data$cellname
vector2 <- seurat_objects$Tfh1@meta.data$cellname
vector3 <- seurat_objects$Tfh2@meta.data$cellname
vector4 <- seurat_objects$Tfh17@meta.data$cellname
##Renaming cells in the object
seurat_objects$PD1Neg <- RenameCells(seurat_objects$PD1Neg, new.names = vector1)
seurat_objects$Tfh1 <- RenameCells(seurat_objects$Tfh1, new.names = vector2)
seurat_objects$Tfh2 <- RenameCells(seurat_objects$Tfh2, new.names = vector3)
seurat_objects$Tfh17 <- RenameCells(seurat_objects$Tfh17, new.names = vector4)
##Renaming rows in metadata
rownames(seurat_objects$PD1Neg@meta.data) <- seurat_objects$PD1Neg@meta.data$cellname
rownames(seurat_objects$Tfh1@meta.data) <- seurat_objects$Tfh1@meta.data$cellname
rownames(seurat_objects$Tfh2@meta.data) <- seurat_objects$Tfh2@meta.data$cellname
rownames(seurat_objects$Tfh17@meta.data) <- seurat_objects$Tfh17@meta.data$cellname
merged_seurat <- merge(x = seurat_objects$PD1Neg,
y = c(seurat_objects$Tfh1,seurat_objects$Tfh2,seurat_objects$Tfh17))
##reading donor information
donor.ids <- read.csv("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_GEX_4.4/input/donor_ids.csv")
colnames(donor.ids)
##Changing the colname in donor_ids
donor_ids <- donor.ids %>%
plyr::rename(c(
"cell" = "cellname",
"donor_id" = "vireo_donor_id",
"prob_max" = "vireo_prob_max",
"prob_doublet" = "vireo_prob_doublet",
"n_vars" = "vireo_n_vars",
"best_singlet" = "vireo_best_singlet",
"best_doublet" = "vireo_best_doublet"))
head(donor_ids)
#91347 7
#when you join the donor_id information into the Meta.data it overwrites the rownames in the meta.data and colnames in the object. These are needed to map!!
##storing cellname information (i.e. object's colname and metadata rownames)
merged_seurat@meta.data$cellname <- rownames(merged_seurat@meta.data)
vectorrux <- colnames(merged_seurat)
##adding donor information
merged_seurat@meta.data <- plyr::join(merged_seurat@meta.data, donor_ids, by = "cellname")
##checking the information was added
head(merged_seurat@meta.data)
##add the rownames back to the metadata
rownames(merged_seurat@meta.data) <- merged_seurat@meta.data$cellname
##add the colnames back to object columns
merged_seurat <- RenameCells(merged_seurat, new.names = vectorrux)
#checking
tail(colnames(merged_seurat))
tail(rownames(merged_seurat@meta.data))
table(merged_seurat@meta.data$vireo_donor_id, useNA = "ifany")
##rownames are back
#Calculate per mito
# The [[ operator can add columns to object metadata. This is a great place to stash QC stats
merged_seurat[["percent.mt"]] <- PercentageFeatureSet(merged_seurat, pattern = "^MT-")
merged_seurat$celltype_donor<- paste0(merged_seurat$orig.ident, "_", merged_seurat$vireo_donor_id)
##save RDS file
saveRDS(merged_seurat, file = "/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_GEX_4.4/input/Thf_diversity_merged_raw.rds")
merged_seurat <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_GEX_4.4/input/Thf_diversity_merged_raw.rds")
####Supplementary Figure 1 C Dimplot pre integration
merged_seurat_s <- subset(merged_seurat, vireo_donor_id!= "doublet")
# Get gene names
gene_list <- rownames(merged_seurat_s)
# Find TCR gene indices using regex
tcr_genes <- grep("^TRA[VJD]|^TRB[VJD]", gene_list, value = TRUE)
# Check how many are found
cat("TCR genes found:", length(tcr_genes), "\n")
## TCR genes found: 96
#96 found
# Remove TCR genes from the object
merged_seurat_s_f <- subset(merged_seurat_s, features = setdiff(gene_list, tcr_genes))##all rows in gene_list that arent in tcr_genes
##Log Norm
merged_seurat_s_f <- NormalizeData(merged_seurat_s_f)
## Normalizing layer: counts.PD1Neg
## Normalizing layer: counts.Tfh1
## Normalizing layer: counts.Tfh2
## Normalizing layer: counts.Tfh17
merged_seurat_s_f <- ScaleData(merged_seurat_s_f)
## Centering and scaling data matrix
merged_seurat_s_f <- FindVariableFeatures(merged_seurat_s_f)
## Finding variable features for layer counts.PD1Neg
## Finding variable features for layer counts.Tfh1
## Finding variable features for layer counts.Tfh2
## Finding variable features for layer counts.Tfh17
merged_seurat_s_f <- RunPCA(merged_seurat_s_f, npcs = 30, verbose = FALSE)
ElbowPlot(merged_seurat_s_f, ndims = 30)## we'll go with 30.
merged_seurat_s_f <- RunUMAP(merged_seurat_s_f, reduction = "pca", dims = 1:30)
## 16:40:26 UMAP embedding parameters a = 0.9922 b = 1.112
## 16:40:26 Read 7373 rows and found 30 numeric columns
## 16:40:26 Using Annoy for neighbor search, n_neighbors = 30
## 16:40:26 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 16:40:27 Writing NN index file to temp file /tmp/RtmpfR1vt6/file2e056c497f917f
## 16:40:27 Searching Annoy index using 1 thread, search_k = 3000
## 16:40:29 Annoy recall = 100%
## 16:40:30 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 30
## 16:40:31 Initializing from normalized Laplacian + noise (using RSpectra)
## 16:40:31 Commencing optimization for 500 epochs, with 310870 positive edges
## 16:40:31 Using rng type: pcg
## 16:40:40 Optimization finished
merged_seurat_s_f <- FindNeighbors(merged_seurat_s_f, reduction = "pca", dims = 1:30)
## Computing nearest neighbor graph
## Computing SNN
donor_cols = c("donor0"="#89D9C9", "donor1" = "#78BBDA", "donor2"="#BDC1E0", "donor3"="#FF7C71")
Idents(object = merged_seurat_s_f) <- "vireo_donor_id"
top <- DimPlot(merged_seurat_s_f, pt.size = 2, group.by = "vireo_donor_id",
cols = c("#89D9C9", "#78BBDA", "#BDC1E0", "#FF7C71"))
Idents(object = Tfh.Map) <- "donor_id"
bottom <- DimPlot(Tfh.Map, label = FALSE, pt.size = 2, group.by = "donor_id",
cols = c("#89D9C9", "#78BBDA", "#BDC1E0", "#FF7C71" ))
print(top/bottom)
#### Supplementary Figure 1 D The data was downloaded from https://zenodo.org/records/11355186 publication is
Massoni-Badosa R,.. et al. An atlas of cells in the human tonsil.
Immunity. 2024 Feb 13;57(2):379-399.e18. doi:
10.1016/j.immuni.2024.01.006. Epub 2024 Jan 31. PMID: 38301653; PMCID:
PMC10869140.
topR <- DimPlot_scCustom(seurat_object = tonsil.tfh, group.by = "annotation_20230508", pt.size = 2, label = T,
colors_use = DiscretePalette_scCustomize(num_colors = 15, palette = "ditto_seq"))
topL <- DimPlot_scCustom(seurat_object = tonsil.tfh, group.by = "customclassif", pt.size = 2, label = T,colors_use = c("#1B685E","#AEB2D8","#C9812B", "#914F1F", "#464BA7"))
metadata <- tonsil.tfh@meta.data
# Define custom colors
colors_use <- c("#fdbf6f", "#a6cee3", "#41ae76", "#fff7bc", "#2171b5", "#fc4e2a")
# Count cells within each customclassif and annotation_20230508 group
metadata_summary <- metadata %>%
count(customclassif, annotation_20230508) # Summarize counts
bottom <- ggplot(metadata_summary, aes(x = customclassif, y = n, fill = annotation_20230508)) +
geom_bar(stat = "identity", position = "stack") + # Stacked bar for counts
labs(x = "Custom Classification", y = "Cell Count", fill = "Annotation") +
scale_fill_manual(values = colors_use) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for clarity
print((topR +topL) /bottom)
human_colors_list <- c("#9e0142", "#f4a582", "#5e4fa2", "#66c2a5", "#e6f598",
"#74add1", "#8c510a", "#bf812d", "#fee090", "#01665e", "#b2abd2")
Tfh.ref.markers <- c("EOMES", "RUNX3","NKG7",
"TGFB1", "TOX", "SH2D1A", "PDCD1",
"TOX2", "MAF", "TCF7",
"LGALS3", "TNFRSF18", "BHLHE40", "CCR4", "ID2")
Stacked_VlnPlot(Tfh.Map, features = Tfh.ref.markers, x_lab_rotate = TRUE,
colors_use = human_colors_list, group.by = "integratedcluster")
#### Supplementary Figure 1 F | Relationship between cell sorted
identity and transitional cell cluster identify.
all_metadata <- data.frame(Tfh.Map@meta.data)
prop_table <- as.data.frame(table(all_metadata$celltype, all_metadata$integratedcluster))
colnames(prop_table)[1] <- "Sort_Cell_Identity"
colnames(prop_table)[2] <- "scRNAseq_clusters"
# Convert to factor and keep level order
# Rename the Sort_Cell_Identity labels by adding an underscore
prop_table$Sort_Cell_Identity <- gsub("^Tfh(\\d+)$", "Tfh_\\1", prop_table$Sort_Cell_Identity)
# Set the desired order with updated labels
prop_table$Sort_Cell_Identity <- factor(prop_table$Sort_Cell_Identity,
levels = c("Tfh_1", "Tfh_2", "Tfh_17", "PD1n"))
prop_table$scRNAseq_clusters <- factor(prop_table$scRNAseq_clusters,
levels = c("Tfh1", "Tfh2", "Tfh17", "Tfh1_CM", "Tfh1.17",
"Tfreg", "IFNI", "heatshock", "ribosomal", "cluster2", "cluster3"))
# Define named color vectors (order matters!)
colors_forthis = c(
"Tfh_1" = "#EC6E6E",
"Tfh_2" = "#5EA5D1",
"Tfh_17" = "#60B275",
"PD1n" = "#D0BFDF",
"Tfh1" = "#A62241",
"Tfh2" = "#464BA7",
"Tfh17" = "#64CBA9",
"Tfh1_CM" = "#FFA582",
"Tfh1.17" = "#E5EAA0",
"Tfreg" = "#61A1BE",
"IFNI" = "#914F1F",
"heatshock" = "#C9812B",
"ribosomal" = "#FFDF94",
"cluster2" = "#1B685E",
"cluster3" = "#AEB2D8")
ggplot(data = prop_table,
aes(axis1 = Sort_Cell_Identity, axis2 = scRNAseq_clusters, y = Freq)) +
scale_x_discrete(limits = c("Sort_Cell_Identity", "scRNAseq_clusters"), expand = c(.2, .05)) +
geom_alluvium(aes(fill = Sort_Cell_Identity)) +
geom_stratum(aes(stratum = after_stat(stratum), fill = after_stat(stratum)), width = 1/4) +
geom_text(stat = "stratum", aes(label = after_stat(stratum)), size = 3) +
scale_fill_manual(values = colors_forthis) +
theme_minimal()
df <- read.csv("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/TfhMap_ClusterMarker_Pathways_190624.csv", sep =',', header = TRUE)
### removed NaN and 0 Z-score
df$direction <- ifelse(df$z.score > 0, "upregulated", "downregulated")
Tfh_colors <- c("#9e0142", "#f4a582", "#5e4fa2", "#66c2a5", "#bf812d")
### plot top 25 upregulated for Tfh1 + Tfh1CM + Tfh2 + Tfh17
df.1 <- df[df$CellType == "Tfh1",]
df.1 <- df.1[1:25,]
df.2 <- df[df$CellType == "Tfh1_CM",]
df.2 <- df.2[1:25,]
df.3 <- df[df$CellType == "Tfh2",]
df.3 <- df.3[1:25,]
df.4<- df[df$CellType == "Tfh17",]
df.4 <- df.4[1:25,]
#merge
C2 <- rbind(df.1, df.2, df.3, df.4)
## getting just Tfh1 and Tfh1CM (16 Feb 2025)
C2sub <- C2[C2$CellType %in% c("Tfh1", "Tfh1_CM"), ]
C2_max <- C2sub %>%
group_by(Ingenuity.Canonical.Pathways) %>%
summarise(max_zscore = max(z.score))
C2_merged <- left_join(C2sub, C2_max, by = "Ingenuity.Canonical.Pathways")
ggplot(C2_merged, aes(x = reorder(Ingenuity.Canonical.Pathways, max_zscore), y = z.score, fill = CellType, group = CellType)) +
geom_bar(stat = "identity", alpha = .6, position = "dodge", width = 0.8) +
scale_fill_manual(values = Tfh_colors, name = "celltype") +
coord_flip() +
xlab("") +
scale_y_continuous(position = "right", limits = c(0, 5)) +
theme_cowplot(12) +
theme(axis.text.y = element_text(size = 10))
Input data, can be downloaded from the NCBI data base under accession number GSE272939 https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE272939
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) # This only works if you are using RStudio
file.names <- list.files(pattern = 'filtered_contig_annotations.csv', recursive = TRUE)
names <- list.files(path="VDJ_Mode")
file_seq <- 1:length(file.names)
if(any(grepl("*.csv", file.names))==TRUE){
tfh_tcr <- file.names %>%
lapply(., function(i){
fread(input = i)
}) %>%
setNames(names)
}
tfh_tcr <- combineTCR(tfh_tcr,
samples = names,
removeNA = TRUE,
removeMulti = TRUE,
filterMulti = TRUE)
## Now we need to edit the barcode, delete everything before "v3_"
tfh_tcr <- tfh_tcr %>%
purrr::map(~.x %>% mutate(barcode = gsub(".*v3_","",barcode)))
tfh_tcr <- tfh_tcr[c("Tfh1_vdjMode_BoyleLab_vdj_ref_v3","Tfh2_vdjMode_BoyleLab_vdj_ref_v3",
"Tfh17_vdjMode_BoyleLab_vdj_ref_v3","PD1Neg_vdjMode_BoyleLab_vdj_ref_v3")]
tfh_tcr <- tfh_tcr %>%
purrr::map2(., file_seq, ~mutate(.x, barcode = sub(1, .y, barcode)))
tfh_tcr_db0 <- lapply(names(tfh_tcr), function(name) {
df <- tfh_tcr[[name]]
df$day <- name
colnames(df)[1] <- "barcode"
colnames(df)[2] <- "sample"
colnames(df)[3] <- "TCR1"
colnames(df)[4] <- "cdr3_aa1"
colnames(df)[5] <- "cdr3_nt1"
colnames(df)[6] <- "TCR2"
colnames(df)[7] <- "cdr3_aa2"
colnames(df)[8] <- "cdr3_nt2"
colnames(df)[9] <- "CTgene"
colnames(df)[10] <- "CTnt"
colnames(df)[11] <- "CTaa"
colnames(df)[12] <- "CTstrict"
return(df)
})
tfh_tcr_db = as.data.frame(do.call(rbind, tfh_tcr_db0))
#write.csv(tfh_tcr_db, "/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/tfh_tcr_db_30325.csv")
#Combining expression and VDJ output
tfh_rep <- combineExpression(tfh_tcr, Tfh.Map,
cloneCall="aa",
proportion = FALSE,
cloneSize=c(Single=1, Small=5, Medium=20, Large=100, Hyperexpanded=500))
tfh_rep@meta.data$integratedcluster <- factor(tfh_rep@meta.data$integratedcluster,
levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfh1.17", "Tfreg", "IFNI",
"heatshock", "ribosomal", "cluster2", "cluster3"))
cluster_col <- c("Tfh1" = "#9e0142", "Tfh1_CM" = "#f4a582",
"Tfh2" = "#5e4fa2", "Tfh17" = "#66c2a5",
"Tfh1.17" = "#e6f598", "Tfreg" = "#74add1",
"IFNI" = "#8c510a", "heatshock" = "#bf812d",
"ribosomal"= "#fee090","cluster2" = "#01665e", "cluster3" = "#b2abd2")
cluster_orders <- c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfh1.17", "Tfreg", "IFNI", "heatshock", "ribosomal", "cluster2", "cluster3")
cell.prop <- tfh_rep@meta.data %>%
mutate(TCR = case_when(is.na(CTaa) == TRUE ~ "No",
TRUE ~ "Yes"))
cell.prop <- cell.prop %>%
group_by(cloneSize) %>%
summarise(n = n())
clone_order <- rev(mixedsort(unique(filter(tfh_rep@meta.data, !is.na(cloneType))$cloneType)))
tfh_rep@meta.data <- tfh_rep@meta.data %>%
mutate(cloneSize = case_when(clonalFrequency == 1 ~ "1",
clonalFrequency == 2 ~ "2",
clonalFrequency == 3 ~ "3",
clonalFrequency == 4 ~ "4",
clonalFrequency > 5 ~ ">5")) %>%
mutate(cloneSize = factor(cloneSize, levels = c('1','2','3','4','>5')))
ggplot(filter(tfh_rep@meta.data, !is.na(cloneSize)),
aes(x = integratedcluster, fill = cloneSize))+
geom_bar(colour = "black", position = position_stack(reverse = TRUE))+
theme_classic()+
scale_fill_manual(values = c("grey","#ABDDA4", "#66C2A5", "#3288BD", "#5E4FA2"))+
labs(y = "Count")+
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle=90))
## Removing singletons
tfh_rep_s <- subset(tfh_rep, subset = clonalFrequency != 1)
No clones were shared across donors. Expanded clonotypes (n³2) with only same fates (exist within the same subset), or different fates are shown.
# Create a new column donor_clust for ChordDiagram "sector"
tfh_rep_s@meta.data$donor_clust <- paste(tfh_rep_s@meta.data$donor_info, tfh_rep_s@meta.data$integratedcluster, sep="_")
# Create circos df using getCirclize function
tfh_circos <- getCirclize(tfh_rep_s, group.by = "donor_clust") %>%
filter(., value!=0)
# Metadata on naming the group
df_names <- tfh_rep_s@meta.data %>%
select(integratedcluster, donor_info, donor_clust) %>%
distinct(donor_clust, .keep_all = TRUE) %>%
remove_rownames() %>%
arrange(., factor(integratedcluster))
# We need to match color donor_clust variables based on cell type
# Currently done manually, need to find easier way
names <- unique(c(tfh_circos$from, tfh_circos$to))
grid.col <- case_when(grepl("_Tfh1$", names) ~ "#9e0142",
grepl("_Tfh1_CM$", names) ~ "#f4a582",
grepl("Tfh2", names) ~ "#5e4fa2",
grepl("Tfh17", names) ~ "#66c2a5",
grepl("Tfh1.17", names) ~ "#e6f598",
grepl("Tfreg", names) ~ "#74add1",
grepl("IFNI", names) ~ "#8c510a",
grepl("heatshock", names) ~ "#bf812d",
grepl("ribosomal", names) ~ "#fee090",
grepl("_cluster2$", names) ~ "#01665e",
grepl("_cluster3$", names) ~ "#b2abd2",
TRUE ~ NA)
names(grid.col) <- names
# We want to split the circos based on donor grouping
group = structure(df_names$donor_info, names = df_names$donor_clust)
group = factor(group, levels = c("donor0", "donor1", "donor2","donor3"))
# Now plot backbone of circos
chordDiagram(tfh_circos,
grid.col = grid.col,
annotationTrack = "grid",
preAllocateTracks = list(list(track.height = 0.075),
# # list(track.height = 0.075),
list(track.height = 0.001)),
annotationTrackHeight = mm_h(5),
group = group,
big.gap = 20, small.gap = 1,
direction.type = c("diffHeight", "arrows"))
# Now manually name the inner sector based on cell type
names <- data.frame(donor_clust = get.all.sector.index()) %>%
left_join(., df_names, by = "donor_clust")
names <- names$integratedcluster
for(i in seq_along(names)){
si <- get.all.sector.index()[i]
nm <- names[i]
xlim = get.cell.meta.data("xlim", sector.index = si, track.index = 2)
ylim = get.cell.meta.data("ylim", sector.index = si, track.index = 2)
circos.text(mean(xlim), mean(ylim), nm, sector.index = si, track.index = 2,
facing = "outside", niceFacing = TRUE, col = "black", cex = 0.5)
}
highlight.sector(filter(df_names, donor_info == 'donor0')$donor_clust, track.index = 1, border = "black", col = "white", text = "Donor 0", cex = 0.75, text.col = "black", niceFacing = TRUE)
highlight.sector(filter(df_names, donor_info == 'donor1')$donor_clust, track.index = 1, border = "black", col = "white", text = "Donor 1", cex = 0.75, text.col = "black", niceFacing = TRUE)
highlight.sector(filter(df_names, donor_info == 'donor2')$donor_clust, track.index = 1, border = "black", col = "white", text = "Donor 2", cex = 0.75, text.col = "black", niceFacing = TRUE)
highlight.sector(filter(df_names, donor_info == 'donor3')$donor_clust, track.index = 1, border = "black", col = "white", text = "Donor 3", cex = 0.75, text.col = "black", niceFacing = TRUE)
circos.clear()
dev.set()
## png
## 2
Tfh.IBSM <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/Tfh.IBSM.2_REANNOTATION_POSTNORM_V2_200724.rds")
DefaultAssay(Tfh.IBSM) <- "integrated"
# remove cluster 13, 30, 24, 29, 23, 17 bc they are low quality clusters
Tfh.IBSM.2 <- subset(Tfh.IBSM, subset = integrated_snn_res.2.2 %in% c(0,1,2,3,4,5,6,7,8,9,10,11,12,14,15,16,18,19,20,21,22,25,26,27,28))
Tfh.IBSM@meta.data$day <- factor(Tfh.IBSM@meta.data$day, levels = c("day0", "day8", "day16", "day36"))
Tfh.IBSM@meta.data$donor_id <- factor(Tfh.IBSM@meta.data$donor_id, levels = c("donor0", "donor1", "donor2", "donor3"))
viol_cols_QC <- c("donor0"="#FFFFFF", "donor1"="#8F9A98", "donor2"="#DADEDD", "donor3"="#4A5251")
##Creating each plot
A <- VlnPlot(Tfh.IBSM, features = c("nCount_RNA"),
cols = viol_cols_QC, group.by = "day", split.by = "donor_id",
pt.size = -1, flip = FALSE)
# Extract max y value from the plot
max_A <- ceiling(max(ggplot_build(A)$data[[1]]$y, na.rm = TRUE))
# Modify the plot to only show the max y tick
Af <-A + scale_y_continuous(breaks = max_A, labels = max_A) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
B <- VlnPlot(Tfh.IBSM, features = c("nFeature_RNA"),
cols = viol_cols_QC, group.by = "day", split.by = "donor_id",
pt.size = -1, flip = FALSE)
# Extract max y value from the plot
max_B <- ceiling(max(ggplot_build(B)$data[[1]]$y, na.rm = TRUE))
# Modify the plot to only show the max y tick
Bf <- B + scale_y_continuous(breaks = max_B, labels = max_B) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
C <- VlnPlot(Tfh.IBSM, features = c("percent.mt"),
cols = viol_cols_QC, group.by = "day", split.by = "donor_id",
pt.size = -1, flip = FALSE)
# Extract max y value from the plot
Y_vals <- ggplot_build(C)$data[[1]]$y
max_Y <- ceiling(max(Y_vals, na.rm = TRUE))
# Modify the plot to only show the max y tick
Cf <- C + scale_y_continuous(breaks = max_Y, labels = max_Y) +
theme(axis.ticks.y = element_blank(),
axis.text.y = element_text(size = 10))
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
Final <- Af/Bf/Cf
print(Final)
right4d <- DimPlot_scCustom(seurat_object = Tfh.IBSM, group.by = "donor_day", pt.size = 2, colors_use = DiscretePalette_scCustomize(num_colors = 16,palette = "stepped"))
print(right4d)
DimPlot_scCustom(seurat_object = Tfh.IBSM, group.by = "integrated_snn_res.2.2", pt.size = 2, label = T,colors_use = DiscretePalette_scCustomize(num_colors = 31,palette = "varibow"))
###### scType label.....
# load gene set preparation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/gene_sets_prepare.R")
# load cell type annotation function
source("https://raw.githubusercontent.com/IanevskiAleksandr/sc-type/master/R/sctype_score_.R")
db_ = "/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/TfhMap_ClusterMarkers_DB_Reannotated_postnorm_190724.xlsx";
tissue = "Immune system"
# prepare gene sets
gs_list = gene_sets_prepare(db_, tissue)
# get cell-type by cell matrix
es.max = sctype_score(scRNAseqData = Tfh.IBSM.2[["integrated"]]@scale.data, scaled = TRUE,
gs = gs_list$gs_positive, gs2 = gs_list$gs_negative)
#write.csv(es.max, "TfhIBSM_scType_esmax_scoringpercell_310524.csv")
# merge by cluster
cL_resutls = do.call("rbind", lapply(unique(Tfh.IBSM.2@meta.data$integrated_snn_res.2.2), function(cl){
es.max.cl = sort(rowSums(es.max[ ,rownames(Tfh.IBSM.2@meta.data[Tfh.IBSM.2@meta.data$integrated_snn_res.2.2==cl, ])]), decreasing = !0)
head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(Tfh.IBSM.2@meta.data$integrated_snn_res.2.2==cl)), 10)
}))
sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)
# set low-confident (low ScType score) clusters to "unknown"
sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"
Tfh.IBSM.2@meta.data$customclassif = ""
for(j in unique(sctype_scores$cluster)){
cl_type = sctype_scores[sctype_scores$cluster==j,];
Tfh.IBSM.2@meta.data$customclassif[Tfh.IBSM.2@meta.data$integrated_snn_res.2.2 == j] = as.character(cl_type$type[1])
}
# write.csv(cL_resutls, "TfhIBSM_scType_cL_results_postnorm_200724.csv")
# write.csv(sctype_scores, "TfhIBSM_scType_sctype_scores_postnorm_200724.csv")
####### plotting sctype scores....
#cL_df <- cL_resutls
#cL_df <- read.csv("TfhIBSM_scType_cL_results_postnorm_200724.csv")
cL_df <- read.csv("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_GEX_4.4/TfhIBSM_scType_clresults_scoringpercluster_310524.csv")
#cL_df <- cL_df[, -c(1,5)]
cL_df$type <- factor(cL_df$type, levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17",
"IFNI", "heatshock", "ribo", "cluster2", "cluster3"))
Tfh_colors_list <- c("#9e0142", "#f4a582", "#5e4fa2", "#66c2a5",
"#8c510a", "#bf812d", "#fee090", "#01665e", "#b2abd2")
####Supplementary Figure 4F
number_ticks <- function(n) {function(limits) pretty(limits, n)}
# ##"TfhIBSM_scType_output_cL_results_barplot_postnorm_200724.pdf", h = 6, w = 8) |incomplete cannot add the low confidence information with current r version. probably need to move after confidence calculation. the read file approaches real plot than calculated one.
ggplot(cL_df, aes(fill=type, y=scores, x=cluster)) +
scale_fill_manual(values =Tfh_colors_list) +
geom_bar(position="stack", stat="identity") +
scale_x_continuous(breaks = 0:29)+
theme_classic()
#### Supplementary Figure 4 H /| Final predicted label for each cell
cluster
sctype_scores$type <- factor(sctype_scores$type, levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17",
"IFNI", "heatshock", "ribo", "cluster2", "cluster3"))
##Supplementary Figure 4HFinal predicted label for each cell cluster.
##|wrong CLUSTER 14 (Tfh2 instead of Tfh1_CM), 19 (Tfh17 instead of cluster2) AND 28 not present in final plot
##pdf("TfhIBSM_scType_output_scType_scores_barplot_postnorm_200724.pdf", h = 6, w = 8)
ggplot(sctype_scores, aes(x=cluster, y=scores, fill=type)) +
scale_fill_manual(values =Tfh_colors_list) +
geom_bar(stat="identity")+theme_classic()
# ###### adding some confidence...
# # Calculate the highest score for each cluster
# max_scores <- cL_df %>%
# group_by(cluster) %>%
# summarize(max_score = max(scores))
#
# # Join the max scores back to the original data
# data_with_max <- cL_df %>%
# left_join(max_scores, by = "cluster")
#
#
# # Calculate the fold change of each score over the maximum score for each cluster
# cL_df2 <- data_with_max %>%
# mutate(confidence = max_score / scores)
#
# # Flag each row based on confidence
# cL_df2 <- cL_df2 %>%
# mutate(flag_stripes = case_when(
# confidence < 0 ~ "normal", # Flag negative values as "normal"
# confidence == 1.0 ~ "choice", # Flag exactly 1.0000 as "choice"
# confidence < 1.1 ~ "low", # Flag values less than 1.1 (excluding 1.0000) as "low"
# TRUE ~ "normal" # Flag all other values as "normal"
# ))
#
# # Identify clusters with "low" flags
# clusters_with_low <- cL_df2 %>%
# filter(flag_stripes == "low") %>%
# group_by(cluster) %>%
# summarize(num_low_flags = n()) %>%
# ungroup()
#
# # Add a column to indicate low confidence (you can customize the condition)
# cL_df2$low_confidence <- cL_df2$flag_stripes == "low"
#
# #### Supplementary Figure 4F) scType was used to label transfer cell signatures (top 20 up and down regulated genes)70
# # for each cell subset identified in healthy ‘map’ data onto over-clustered CHMI data. Each cell cluster71
# # was scored for each reference signature (left), and the majority score (right) used to collapse clusters72
# # into cell subsets.
# ##"TfhIBSM_scType_output_cL_results_barplot_postnorm_WITHCONFIDENCE_200724.pdf", h = 6, w = 8)
# # ggplot(cL_df2, aes(y = scores, x = cluster, fill = type, pattern = low_confidence)) +
# # geom_bar_pattern(position = "stack", stat = "identity",
# # pattern_fill = "black",
# # pattern_angle = 45,
# # pattern_density = 0.1,
# # pattern_spacing = 0.02) +
# # scale_fill_manual(values = Tfh_colors_list) +
# # scale_pattern_manual(values = c(`TRUE` = "stripe", `FALSE` = "none")) +
# # theme_classic()
#
# # ggplot(cL_df2, aes(y = scores, x = cluster, fill = type, pattern = low_confidence)) +
# # geom_bar_pattern(position = "stack", stat = "identity",
# # pattern_fill = "black",
# # pattern_angle = 45,
# # pattern_density = 0.1,
# # pattern_spacing = 0.02) +
# # scale_fill_manual(values = Tfh_colors_list) +
# # scale_pattern_manual(values = c(`TRUE` = "stripe", `FALSE` = "none")) +
# # theme_classic()
#
# #### that cluster 14 - scores between Tfh1/TFh1CM/Tfh2 are VERY VERY CLOSE...
# # Tfh1CM: 4274, Tfh2: 4474
####Supplementary Figure 4 G /| Tfh cell cluster markers in each subset for manual adjustment of “low-confidence” predicted cluster identities.
DefaultAssay(Tfh.IBSM) <- "RNA"
Tfh.mixed2 <- subset(Tfh.IBSM, subset = integrated_snn_res.2.2 %in% c(14,18,19,21,20,22))
Tfh.mixed2@meta.data$integrated_snn_res.2.2 <- factor(Tfh.mixed2@meta.data$integrated_snn_res.2.2,
levels = c(14,18,20,22,19,21))
#pdf("TfhIBSM_mixedTfhs_clustercomparison_vlnplot_070824.pdf", h =8, w = 4)
viol_colors = c("#2EA174","#4594D0","#2C4B9F","#3642A3", "#1E4DA8", "#425BB0")
Stacked_VlnPlot(Tfh.mixed2,
features = c("CXCR3", "GZMK", "CST7", "NKG7", "CCR7", "SELL", "MAF", "TOX2", "TCF7", "CCR6", "BHLHE40"),
x_lab_rotate = TRUE,colors_use = viol_colors,
group.by = "integrated_snn_res.2.2")
metadata <- as.data.frame(Tfh.IBSM.2@meta.data)
try <- metadata[, c("seurat_clusters", "day", "donor_id", "alternative_clusters")]
try.5 <- try %>%
group_by(alternative_clusters, donor_id, day) %>%
summarise(n = n(), .groups = "drop") %>%
group_by(donor_id, day) %>%
mutate(freq = n / sum(n))
## Add order before plotting
try.5$donor_id <- factor(try.5$donor_id, levels = c("donor0", "donor1", "donor2", "donor3"))
try.5$day <- factor(try.5$day, levels = c("day0", "day8", "day16", "day36"))
try.5$alternative_clusters <- factor(try.5$alternative_clusters,
levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17","Tfh1.17",
"Tfreg", "IFNI", "heatshock", "ribo",
"cluster2", "cluster3"))
##Only clusters showing
try.5_subset <- try.5 %>%
filter(alternative_clusters %in% c("Tfreg", "IFNI", "heatshock", "ribo",
"cluster2", "cluster3"))
ggplot(data = try.5_subset, aes(x=day, y=freq)) +
geom_line(aes(group = donor_id, colour = donor_id)) +
geom_point(aes(group = donor_id, colour = donor_id)) +
geom_boxplot(data = try.5_subset, aes(x = day, y = freq), fill = "grey", width = 0.5, outlier.shape = NA) +
facet_wrap(~alternative_clusters, ncol = 3) +
theme_classic() +
scale_colour_manual(values = c("#d53e4f", "#3288bd", "#1a9850", "#c2a5cf"))
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) # This only works if you are using RStudio
file.names <- list.files(pattern = 'filtered_contig_annotations.csv', recursive = TRUE)
names <- list.files(path='VDJ')
file_seq <- 1:length(file.names)
if(any(grepl("*.csv", file.names))==TRUE){
ibsm_tcr <- file.names %>%
lapply(., function(i){
fread(input = i)
}) %>%
setNames(names)
}
ibsm_tcr <- ibsm_tcr[mixedsort(names(ibsm_tcr))]
ibsm_tcr <- ibsm_tcr %>%
map2(., file_seq, ~mutate(.x, barcode = sub(1, .y, barcode)))
names <- names(ibsm_tcr)
ibsm_tcr <- combineTCR(ibsm_tcr,
samples = names,
removeNA = TRUE,
removeMulti = TRUE,
filterMulti = TRUE)
## Now we need to edit the barcode, delete everything before "v3_"
ibsm_tcr <- ibsm_tcr %>%
map(~.x %>% mutate(barcode = gsub(".*VDJ_","",barcode)))
ibsm_tcr_db0 <- lapply(names(ibsm_tcr), function(name) {
df <- ibsm_tcr[[name]]
df$day <- name
colnames(df)[1] <- "barcode"
colnames(df)[2] <- "sample"
colnames(df)[3] <- "TCR1"
colnames(df)[4] <- "cdr3_aa1"
colnames(df)[5] <- "cdr3_nt1"
colnames(df)[6] <- "TCR2"
colnames(df)[7] <- "cdr3_aa2"
colnames(df)[8] <- "cdr3_nt2"
colnames(df)[9] <- "CTgene"
colnames(df)[10] <- "CTnt"
colnames(df)[11] <- "CTaa"
colnames(df)[12] <- "CTstrict"
return(df)
})
ibsm_tcr_db = as.data.frame(do.call(rbind, ibsm_tcr_db0))
write.csv(ibsm_tcr.db, "/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/ibsm40_tcr_db_30325.csv")
##reading seurat object
ibsm <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_TCR_4.4/IBSM40/Tfh.IBSM.2_REANNOTATION_POSTNORM_V2_200724.rds")
ibsm_rep <- combineExpression(ibsm_tcr, ibsm,
cloneCall="aa",
proportion = FALSE,
cloneSize=c(Single=1, Small=5, Medium=20, Large=100, Hyperexpanded=500))
ibsm_rep@meta.data$seurat_clusters <- factor(ibsm_rep@meta.data$seurat_clusters,
levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfreg", "IFNI",
"heatshock", "ribo", "cluster2", "cluster3"))
ibsm_rep@meta.data$alternative_clusters <- factor(ibsm_rep@meta.data$alternative_clusters,
levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfreg", "IFNI",
"heatshock", "ribo", "cluster2", "cluster3"))
##Determine clone order
ibsm_rep@meta.data <- ibsm_rep@meta.data %>%
mutate(cloneN = case_when(clonalFrequency == 1 ~ "1",
clonalFrequency == 2 ~ "2",
clonalFrequency == 3 ~ "3",
clonalFrequency == 4 ~ "4",
clonalFrequency > 5 ~ ">5")) %>%
mutate(cloneN = factor(cloneN, levels = c('1','2','3','4','>5')))
ibsm_rep@meta.data <- ibsm_rep@meta.data %>%
mutate(cloneN2 = case_when(clonalFrequency == 1 ~ "Clone # (N=1)",
clonalFrequency > 1 ~ "Clone # (N>1)")) %>%
mutate(cloneN2 = factor(cloneN2, levels = c('Clone # (N=1)','Clone # (N>1)')),
day = factor(day, levels = c("day0", "day8", "day16", "day36")))
cluster_col <- c("Tfh1" = "#9e0142", "Tfh1_CM" = "#f4a582", "Tfh2" = "#5e4fa2",
"Tfh17" = "#66c2a5", "Tfreg" = "#74add1",
"IFNI" = "#8c510a", "heatshock" = "#bf812d", "ribosomal" = "#fee090",
"cluster2" = "#01665e", "cluster3" = "#b2abd2")
cluster_orders <- c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17", "Tfreg", "IFNI", "heatshock", "ribo", "cluster2", "cluster3")
donor_col <- c("donor0"="#d53e4f", "donor1"="#3288bd", "donor2"="#1a9850", "donor3" = "#c2a5cf")
cell.prop <- ibsm_rep@meta.data %>%
mutate(TCR = case_when(is.na(CTaa) == TRUE ~ "No",
TRUE ~ "Yes"))
cell.prop <- cell.prop %>%
group_by(alternative_clusters, TCR) %>%
summarise(n = n()) %>%
group_by(alternative_clusters) %>%
mutate(total = sum(n)) %>%
mutate(prop = n/total*100)
ggplot(cell.prop, aes(x = alternative_clusters, y = prop, fill = TCR))+
geom_col(color = "black")+
theme_bw()+
ylab("Proportion")+
geom_text(aes(label = round(prop, digits = 1)), position = position_stack(vjust = 0.5))+
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))+
scale_fill_manual(values = c("grey","white"))
#### Supplementary Figure 5 B | Clonal size of other Tfh cell subsets
identified over time.
ibsm_rep_subC <- subset(ibsm_rep, subset = alternative_clusters %in% c("Tfreg", "IFNI", "heatshock", "ribo", "cluster2", "cluster3"))
ggplot(filter(ibsm_rep_subC@meta.data, !is.na(cloneN)),
aes(x = alternative_clusters, fill = cloneN))+
geom_bar(colour = "black", position = position_stack(reverse = TRUE), linewidth = 0.3)+
# geom_text(stat='count', aes(label= ..count..), vjust=-1)+
# scale_x_discrete(limits = clone_order)+
theme_classic()+
# scale_fill_brewer(palette = "GnBu", direction = 1)+
scale_fill_manual(values = c("grey","#ABDDA4", "#66C2A5", "#3288BD", "#5E4FA2"))+
labs(y = "Count")+
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle=90))+
facet_wrap(~day, nrow = 1)
set.seed(123)
ibsm_rep@meta.data$day = factor(ibsm_rep@meta.data$day, levels = c('day0','day8','day16','day36'))
ibsm_rep@meta.data$donor_day = factor(ibsm_rep@meta.data$donor_day,
levels = c('donor0_day0','donor0_day8','donor0_day16','donor0_day36',
'donor1_day0','donor1_day8','donor1_day16','donor1_day36',
'donor2_day0','donor2_day16','donor2_day36',
'donor3_day0','donor3_day8','donor3_day16','donor3_day36'))
ibsm_repMon <- subset(ibsm_rep, subset = clonalFrequency != 1)
ibsm_repMon <- subset(ibsm_repMon, subset = donor_day != 'donor2_day8')
ibsm_overlap <- clonalOverlap(ibsm_repMon, cloneCall = "aa",
method = "jaccard", group.by = "donor_day", exportTable = F, palette = "plasma")+
theme(axis.title = element_blank(),
#axis.text = element_blank(),
axis.text.x = element_text(angle = 90),
axis.ticks = element_blank()
)
meta <- ibsm_rep@meta.data %>%
distinct(donor_day, donor_id, day)
meta$day = factor(meta$day, levels = c('day0','day8','day16','day36'))
meta$donor_day = factor(meta$donor_day, levels = c('donor0_day0','donor0_day8','donor0_day16','donor0_day36',
'donor1_day0','donor1_day8','donor1_day16','donor1_day36',
'donor2_day0','donor2_day16','donor2_day36',
'donor3_day0','donor3_day8','donor3_day16','donor3_day36'))
donor_labs <- meta %>%
filter(!is.na(donor_id)) %>%
filter(!is.na(donor_day)) %>%
ggplot(aes(donor_day, y = 1, fill = donor_id)) +
geom_tile() +
scale_fill_brewer(palette = 'Set1', name = "Donor ID") +
theme_void()
day_labs <- meta %>%
filter(!is.na(day)) %>%
filter(!is.na(donor_day)) %>%
ggplot(aes(donor_day, y = 1, fill = day)) +
geom_tile() +
scale_fill_brewer(palette = 'Set3', name = "Day") +
theme_void()
final_plot <- day_labs / donor_labs / ibsm_overlap +
patchwork::plot_layout(heights = c(0.05, 0.05, 1), guides = 'collect')
print(final_plot)
ibsm_rep <- subset(ibsm_rep, subset = donor_day != 'donor2_day8')
# Create a new column day_clust for ChordDiagram "sector"
ibsm_rep@meta.data$day_clust <- paste(ibsm_rep@meta.data$day,
ibsm_rep@meta.data$alternative_clusters, sep="_")
donors <- mixedsort(unique(ibsm_rep@meta.data$donor_id))
for (i in donors){
# Create circos df using getCirclize function
ibsm_circos <- getCirclize(subset(ibsm_rep, subset = donor_id == i),
group.by = "day_clust") %>% filter(., value!=0)
# Metadata on naming the group
df_names <- filter(ibsm_rep@meta.data, donor_id == i) %>%
select(alternative_clusters, day, day_clust) %>%
distinct(day_clust, .keep_all = TRUE) %>%
remove_rownames() %>%
arrange(., factor(alternative_clusters))
# We need to match color day_clust variables based on cell type
# Currently done manually, need to find easier way
names <- unique(c(ibsm_circos$from, ibsm_circos$to))
grid.col <- case_when(grepl("_Tfh1$", names) ~ "#9e0142",
grepl("_Tfh1_CM$", names) ~ "#f4a582",
grepl("Tfh2", names) ~ "#5e4fa2",
grepl("Tfh17", names) ~ "#66c2a5",
grepl("Tfreg", names) ~ "#74add1",
grepl("IFNI", names) ~ "#8c510a",
grepl("heatshock", names) ~ "#bf812d",
grepl("ribo", names) ~ "#fee090",
grepl("_cluster2$", names) ~ "#01665e",
grepl("_cluster3$", names) ~ "#b2abd2",
TRUE ~ NA)
names(grid.col) <- names
# We want to split the circos based on Day grouping
group = structure(df_names$day, names = df_names$day_clust)
group = factor(group, levels = mixedsort(unique(group)))
# Now plot backbone of circos
chordDiagram(ibsm_circos,
grid.col = grid.col,
annotationTrack = "grid",
preAllocateTracks = list(list(track.height = 0.075),
# # list(track.height = 0.075),
list(track.height = 0.001)),
annotationTrackHeight = mm_h(5),
group = group,
big.gap = 20, small.gap = 1,
direction.type = c("diffHeight", "arrows"))
title(i)
# Now manually name the inner sector based on cell type
names <- data.frame(day_clust = get.all.sector.index()) %>%
left_join(., df_names, by = "day_clust")
names <- names$alternative_clusters
for(i in seq_along(names)){
si <- get.all.sector.index()[i]
nm <- names[i]
xlim = get.cell.meta.data("xlim", sector.index = si, track.index = 2)
ylim = get.cell.meta.data("ylim", sector.index = si, track.index = 2)
circos.text(mean(xlim), mean(ylim), nm, sector.index = si, track.index = 2,
facing = "outside", niceFacing = TRUE, col = "black", cex = 0.5)
}
# Now plot outer ribbon on days
# highlight.sector(filter(df_names, day == 'day0')$day_clust, track.index = 1, col = viridis(n=4)[1], text = "Day 0", cex = 0.75, text.col = "white", niceFacing = TRUE)
# highlight.sector(filter(df_names, day == 'day8')$day_clust, track.index = 1, col = viridis(n=4)[2], text = "Day 8", cex = 0.75, text.col = "white", niceFacing = TRUE)
# highlight.sector(filter(df_names, day == 'day16')$day_clust, track.index = 1, col = viridis(n=4)[3], text = "Day 16", cex = 0.75, text.col = "white", niceFacing = TRUE)
# highlight.sector(filter(df_names, day == 'day36')$day_clust, track.index = 1, col = viridis(n=4)[4], text = "Day 36", cex = 0.75, text.col = "white", niceFacing = TRUE)
for (j in unique(group)){
highlight.sector(filter(df_names, day == j)$day_clust, track.index = 1, , border = "black", col = "white", text = j, cex = 0.75, text.col = "black", niceFacing = TRUE)
}
circos.clear()
}
dev.set()
## png
## 2
TfhIBSM <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/Tfh.IBSM.2_REANNOTATION_POSTNORM_V2_200724.rds")
Tfh_colors_list <- c("#9e0142", "#f4a582", "#5e4fa2", "#66c2a5", "#74add1",
"#8c510a", "#bf812d", "#fee090", "#01665e", "#b2abd2")
TfhIBSM@meta.data$alternative_clusters <- factor(TfhIBSM@meta.data$alternative_clusters,
levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17","Tfh1.17",
"Tfreg", "IFNI", "heatshock", "ribo",
"cluster2", "cluster3"))
# Pull out just D16 cells and see..
Tfh.16 <- subset(TfhIBSM, subset = day == "day16")
#DefaultAssay(Tfh.16)
# to check if lognorm has been performed... or alternatively just run each counts/data separately, logorm will be non-integers ie 1.xxx
#Tfh.16[['RNA']]@counts@x == Tfh.16[['RNA']]@data@x
# Activation genes
Stacked_VlnPlot(Tfh.16, features = c("MKI67", "CD38", "ICOS"), x_lab_rotate = TRUE,
group.by = "alternative_clusters", colors_use = Tfh_colors_list)
#### Supplementary Figure 6 B and C /| Despite upregulation of genes
associated with inflammation, expression profiles were still different
between Tfh1_cyto and Tfh1_CCR7 subsets. Tfh cell subsets maintained
expression profiles of identifying cluster markers.
# Tfh1 genes..
Stacked_VlnPlot(Tfh.16, features = c("GZMK", "NKG7", "CCL5", "CXCR3"),
x_lab_rotate = TRUE,
group.by = "alternative_clusters", colors_use = Tfh_colors_list)
# TCM/Th2 genes...
Stacked_VlnPlot(Tfh.16, features = c("SELL", "CCR7", "TIGIT", "SH2D1A", "TCF7", "MAF"),
x_lab_rotate = TRUE,
group.by = "alternative_clusters", colors_use = Tfh_colors_list)
###### Getting data for Supplementary Figure 6D
TfhIBSM <- readRDS("/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/input/Tfh.IBSM.2_REANNOTATION_POSTNORM_V2_200724.rds")
TfhIBSM@meta.data$alternative_clusters <- factor(TfhIBSM@meta.data$alternative_clusters,
levels = c("Tfh1", "Tfh1_CM", "Tfh2", "Tfh17",
"Tfh1.17", "Tfreg", "IFNI",
"heatshock", "ribo", "cluster2",
"cluster3"))
## first we want to get a pseudobulk count matrix..
# renaming cluster for ease.. (you can skip this if you don't need)
TfhIBSM@meta.data$alternative_clusters <- gsub("Tfh1_CM", "Tfh1CM", TfhIBSM@meta.data$alternative_clusters, fixed = TRUE)
TfhIBSM@meta.data$donor_day_cluster <- paste(TfhIBSM@meta.data$donor_day, TfhIBSM@meta.data$alternative_clusters, sep = "_")
TfhIBSM@meta.data$donor_day_cluster <- factor(TfhIBSM@meta.data$donor_day_cluster)
# now getting pseudobulk counts by the combination I want: donor_day_cluster
DefaultAssay(TfhIBSM) <- "RNA"
TfhIBSM<- NormalizeData(TfhIBSM)
# this is right... you aggregate using raw counts...
TfhIBSM.Average <- AggregateExpression(TfhIBSM, group.by = "donor_day_cluster", slot = "counts", assays = "RNA", return.seurat = T)
## Names of identity class contain underscores ('_'), replacing with dashes ('-')
## Centering and scaling data matrix
##
## This message is displayed once every 8 hours.
### tidying up metadata naming
TfhIBSM.Average@meta.data$donor_day_cluster <- as.factor(rownames(TfhIBSM.Average@meta.data))
# Split name column into firstname and last name
TfhIBSM.Average@meta.data[c("donor", "day", "seurat_clusters")] <- str_split_fixed(TfhIBSM.Average@meta.data$donor_day_cluster, "-", 3)
# Now extract the pseudobulked log-counts (NON-SCALED)
TfhIBSM.scalemat <- GetAssayData(TfhIBSM.Average, layer = "data") # data is already log-normalised..
## put in metadata
TfhIBSM.meta <- as.data.frame(TfhIBSM.Average@meta.data)
TfhIBSM.scalemat <- as.data.frame(TfhIBSM.scalemat)
TfhIBSM.scalemat <- as.data.frame(t(TfhIBSM.scalemat))
TfhIBSM.scalemat$donor_day_cluster<- rownames(TfhIBSM.scalemat)
Average.df <- merge(x = TfhIBSM.scalemat, y = TfhIBSM.meta, by = "donor_day_cluster")
# Average.df <- read.csv("TfhIBSM_Averagemat_LOGNORMALISED_donordaycluster_V2_090824.csv", row.names = 1)
rownames(Average.df) <- Average.df$donor_day_cluster
Average.df$day <- factor(Average.df$day, levels = c("day0", "day8", "day16", "day36"))
Average.df$seurat_clusters <- factor(Average.df$seurat_clusters, levels = c("Tfh1", "Tfh1CM", "Tfh2", "Tfh17",
"Tfreg", "IFNI",
"heatshock", "ribo", "cluster2",
"cluster3"))
# ### subset
# Average.dfsub <- Average.df[Average.df$day == "day16", ]
# Average.dfsub <- Average.dfsub[Average.dfsub$seurat_clusters %in% c("Tfh1", "Tfh1CM", "Tfh2"), ]
#
#
# #### okay now to reorder EVERYTHING...
#
# my_order <- c("donor0-day16-Tfh1", "donor1-day16-Tfh1", "donor2-day16-Tfh1", "donor3-day16-Tfh1",
# "donor0-day16-Tfh1CM", "donor1-day16-Tfh1CM", "donor2-day16-Tfh1CM", "donor3-day16-Tfh1CM",
# "donor0-day16-Tfh2", "donor1-day16-Tfh2", "donor2-day16-Tfh2", "donor3-day16-Tfh2")
#
# Average.dfsub <- Average.dfsub[match(my_order, rownames(Average.dfsub)), ]
#
# ### now reoreder everything else I want as well...
# Average.dfsub$seurat_clusters <- factor(Average.dfsub$seurat_clusters, levels = c("Tfh1","Tfh1CM", "Tfh2"))
# Average.dfsub$donor <- factor(Average.dfsub$donor, levels = c("donor0", "donor1", "donor2", "donor3"))
upset0 <- read.csv(file="/working_groups/boylelab/shared/ZulyPava/Tfh_diversity_finalcode/sigGenes/psbulk_sigGens_all_DayClusters_080824.csv", row.names = 1)
##Getting dataset
##Input
## Long format: gene, pval, logFC, cluster, day
## From pseudobulk analysis subset of genes with p_adj <0.005 and
## LogFC>abs(1)
upset1 <- upset0[,c(3,8,4,1,2)]
#Change colnames
colnames(upset1)[4] ="celltype"
colnames(upset1)[3] ="Log2FC"
colnames(upset1)[2] ="p_adj"
#reorder...
upset1$day <- factor(upset1$day, levels = c("day8", "day16", "day36"))
upset1$celltype <- factor(upset1$celltype, levels = c("Tfh1", "Tfh1CM", "Tfh2", "Tfh17", "Tfreg", "IFNI", "heatshock", "ribo", "cluster2", "cluster3"))
### subset
Average.dfsub <- Average.df[Average.df$day %in% c("day0", "day16"), ]
table(Average.dfsub$day)
##
## day0 day8 day16 day36
## 40 0 40 0
table(Average.dfsub$seurat_clusters)
##
## Tfh1 Tfh1CM Tfh2 Tfh17 Tfreg IFNI heatshock ribo
## 8 8 8 8 8 8 8 8
## cluster2 cluster3
## 8 8
Average.dfsub <- Average.dfsub[Average.dfsub$seurat_clusters %in% c("Tfh1", "Tfh1CM", "Tfh2"), ]
table(Average.dfsub$seurat_clusters)
##
## Tfh1 Tfh1CM Tfh2 Tfh17 Tfreg IFNI heatshock ribo
## 8 8 8 0 0 0 0 0
## cluster2 cluster3
## 0 0
#### okay now to reorder EVERYTHING...
my_order <- c("donor0-day0-Tfh1", "donor1-day0-Tfh1", "donor2-day0-Tfh1", "donor3-day0-Tfh1",
"donor0-day16-Tfh1", "donor1-day16-Tfh1", "donor2-day16-Tfh1", "donor3-day16-Tfh1",
"donor0-day0-Tfh1CM", "donor1-day0-Tfh1CM", "donor2-day0-Tfh1CM", "donor3-day0-Tfh1CM",
"donor0-day16-Tfh1CM", "donor1-day16-Tfh1CM", "donor2-day16-Tfh1CM", "donor3-day16-Tfh1CM",
"donor0-day0-Tfh2", "donor1-day0-Tfh2", "donor2-day0-Tfh2", "donor3-day0-Tfh2",
"donor0-day16-Tfh2", "donor1-day16-Tfh2", "donor2-day16-Tfh2", "donor3-day16-Tfh2")
Average.dfsub <- Average.dfsub[match(my_order, rownames(Average.dfsub)), ]
### now reoreder everything else I want as well...
Average.dfsub$seurat_clusters <- factor(Average.dfsub$seurat_clusters, levels = c("Tfh1","Tfh1CM", "Tfh2"))
Average.dfsub$donor <- factor(Average.dfsub$donor, levels = c("donor0", "donor1", "donor2", "donor3"))
Average.dfsub$day <- factor(Average.dfsub$day, levels = c("day0", "day16"))
### subset gene lists...
colnames(upset1)[3] <- "logFC"
Tfh1CMgenes <- upset1%>%filter(celltype=="Tfh1CM", day== "day16")
Tfh1genes <- upset1%>%filter(celltype=="Tfh1", day== "day16")
Tfh2genes <- upset1%>%filter(celltype=="Tfh2", day== "day16")
# edgeR was done - D16 v D0, so logFC< 0 = up at d16, log FC >0 = down at d16, but up at day 0
Tfh1CMgenes$dir_sig[Tfh1CMgenes$p_adj<0.05 & Tfh1CMgenes$logFC<0] <-"up"
Tfh1CMgenes$dir_sig[Tfh1CMgenes$p_adj<0.05 & Tfh1CMgenes$logFC>0] <-"down"
Tfh1genes$dir_sig[Tfh1genes$p_adj<0.05 & Tfh1genes$logFC<0] <-"up"
Tfh1genes$dir_sig[Tfh1genes$p_adj<0.05 & Tfh1genes$logFC>0] <-"down"
Tfh2genes$dir_sig[Tfh2genes$p_adj<0.05 & Tfh2genes$logFC<0] <-"up"
Tfh2genes$dir_sig[Tfh2genes$p_adj<0.05 & Tfh2genes$logFC>0] <-"down"
# let's get D16up - top 50
Tfh1CMgenes$logFC <- Tfh1CMgenes$logFC*-1
Tfh1CMgenes <- Tfh1CMgenes[order(Tfh1CMgenes$logFC, decreasing = TRUE),]
topCM <- head(Tfh1CMgenes, n=20)
Tfh1genes$logFC <- Tfh1genes$logFC*-1
Tfh1genes <- Tfh1genes[order(Tfh1genes$logFC, decreasing = TRUE),]
top1 <- head(Tfh1genes, n=20)
Tfh2genes$logFC <- Tfh2genes$logFC *-1
Tfh2genes <- Tfh2genes[order(Tfh2genes$logFC, decreasing = TRUE),]
top2 <- head(Tfh2genes, n=20)
# Extract gene columns and merge
merged_genes <- unique(c(topCM$gene, top1$gene, top2$gene))
# Ensure only the top 50 unique genes
merged_genes <- unique(merged_genes[1:50])
dfsub_16<- Average.dfsub[, colnames(Average.dfsub) %in% merged_genes]
dfsub_16 <- as.matrix(dfsub_16)
dfsub_16 = scale(dfsub_16)
col = list(Cluster = c("Tfh1" = "#b2182b", "Tfh1CM" = "#fbb4ae", "Tfh2" = "#1f78b4"),
Donor = c("donor0" = "#cb7a88", "donor1" = "#ccab7b", "donor2" = "#a3cb7a", "donor3" = "#7cbecc"),
Day = c("day0" = "#bababa", "day16" = "#fdae61"))
ha <- HeatmapAnnotation(Cluster=Average.dfsub$seurat_clusters,
Day=Average.dfsub$day,
Donor=Average.dfsub$donor,
col = col)
Heatmap(t(dfsub_16),
name = "RNAseq", col=mako(50),
top_annotation = ha,
cluster_rows = TRUE, cluster_columns = FALSE,
column_title = "Genes", row_title = "Samples",
row_names_gp = gpar(fontsize=6),
column_names_gp = gpar(fontsize=4))
xfun::session_info(c("plyr", "tidyr", "dplyr", "Seurat", "patchwork", "ggplot2", "sctransform",
"BiocManager", "limma", "scCustomize", "qs", "viridis", "tidyverse",
"ggpubr", "scRepertoire", "data.table", "gtools", "ggraph", "tibble",
"circlize", "ggalluvial", "RColorBrewer", "rstackdeque", "ComplexHeatmap",
"harmony", "openxlsx", "HGNChelper", "pheatmap", "forcats", "ggthemes",
"ggrepel", "cowplot", "ggfortify", "gplots", "reshape2", "rstatix", "eulerr"), dependencies = FALSE)
## Registered S3 method overwritten by 'eulerr':
## method from
## plot.venn gplots
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 22.04.4 LTS
##
## Locale:
## LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## LC_PAPER=en_US.UTF-8 LC_NAME=C
## LC_ADDRESS=C LC_TELEPHONE=C
## LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## Package version:
## BiocManager_1.30.25 circlize_0.4.16 ComplexHeatmap_2.20.0
## cowplot_1.1.3 data.table_1.16.0 dplyr_1.1.4
## eulerr_7.0.2 forcats_1.0.0 ggalluvial_0.12.5
## ggfortify_0.4.17 ggplot2_3.5.1 ggpubr_0.6.0
## ggraph_2.2.1 ggrepel_0.9.6 ggthemes_5.1.0
## gplots_3.2.0 gtools_3.9.5 harmony_1.2.3
## HGNChelper_0.8.15 limma_3.60.6 openxlsx_4.2.8
## patchwork_1.3.0 pheatmap_1.0.12 plyr_1.8.9
## qs_0.27.3 RColorBrewer_1.1-3 reshape2_1.4.4
## rstackdeque_1.1.1 rstatix_0.7.2 scCustomize_3.0.1
## scRepertoire_2.0.7 sctransform_0.4.1 Seurat_5.2.1
## tibble_3.2.1 tidyr_1.3.1 tidyverse_2.0.0
## viridis_0.6.5